# -------------------------------------------------------------------
# Purpose: Creates Figure B1
# Author:  Max Posch, 25/07/2025
# Usage:   Source this script to generate both panels of the figure
# -------------------------------------------------------------------
# Abort early unless every directory this script reads from or writes to exists
stopifnot(
    dir.exists(pdataanalysis),
    dir.exists(pdataconfanalysis),
    dir.exists(poutputappendix)
)


# Left panel: cumulative share plotted against the within-group surname share

## Read the surname panel and keep rows with year >= 1900
data_mp <- fread(
    file.path(pdataconfanalysis, "ivSurnameStocksMetaphone1900BoundariesPanel18951940.csv")
)[year >= 1900]

## Build the plotting data
# Total n_namelast_mp_adjp per (year, gisjoin_1900, namelast_mp) cell,
# keeping only cells whose total is strictly positive.
temp_mp <- data_mp[
    ,
    .(n_namelast_mp_adjp = sum(n_namelast_mp_adjp)),
    keyby = .(year, gisjoin_1900, namelast_mp)
][n_namelast_mp_adjp > 0]

# Share of each cell in its (year, gisjoin_1900) total; column added by reference.
temp_mp[
    ,
    p_namelast_mp_adjp := n_namelast_mp_adjp / sum(n_namelast_mp_adjp),
    keyby = .(year, gisjoin_1900)
]

# Sort ascending by share so the running sum accumulates from the smallest
# shares upwards; the running sum is normalised by the overall sum of shares.
temp_mp <- temp_mp[order(p_namelast_mp_adjp)]
temp_mp[, `:=`(
    cumulative_share = cumsum(p_namelast_mp_adjp) / sum(p_namelast_mp_adjp),
    bin = cut_width(p_namelast_mp_adjp, width = 0.0001)
)]

# Collapse to one row per 0.0001-wide share bin (mean of both columns).
temp_mp <- temp_mp[
    ,
    lapply(.SD, mean),
    by = bin,
    .SDcols = c("p_namelast_mp_adjp", "cumulative_share")
]

## Draw and save the panel
plotname <- file.path(poutputappendix, "figureB01left.pdf")
p <- ggplot(
    data = temp_mp,
    mapping = aes(x = p_namelast_mp_adjp * 100, y = cumulative_share * 100)
) +
    geom_line() +
    # Points outside 0-3 on the (percent) x axis are dropped by these limits.
    scale_x_continuous(limits = c(0, 3)) +
    labs(
        x = "Surname population share within county (in %)",
        y = NULL,
        subtitle = "Cumulative share of surnames (in %)"
    ) +
    theme_minimal()
ggsave(filename = plotname, plot = p, width = 4.6, height = 4.6, units = "in")

cat("Figure B1 left saved to:", plotname, "\n")


# Right panel: histogram of entropy_namelast_mp_adjp_w, with bar heights
# expressed as a percentage of all binned observations

## Load data
load(file.path(pdataanalysis, "countyLevel19001940.RData"))
# load() restores objects under the names they were saved with, so confirm the
# data set used below is actually available instead of failing later with an
# opaque "object not found" error inside the plotting call.
if (!exists("countyLevel19001940")) {
    stop(
        "Object 'countyLevel19001940' is not available after loading ",
        "countyLevel19001940.RData",
        call. = FALSE
    )
}

## Print plot
plotname <- file.path(poutputappendix, "figureB01right.pdf")
p <- ggplot(countyLevel19001940, aes(x = entropy_namelast_mp_adjp_w)) +
    # bins = 30 spells out the stat_bin() default: the figure is unchanged, but
    # the "using `bins = 30`" message is no longer emitted on every run.
    stat_bin(
        aes(y = after_stat(count) / sum(after_stat(count)) * 100),
        bins = 30
    ) +
    labs(
        x = "Surname diversity",
        y = NULL,
        subtitle = "Share of counties (in %)"
    ) +
    theme_minimal()
ggsave(plotname, p, width = 4.6, height = 4.6, units = "in")

cat("Figure B1 right saved to:", plotname, "\n")
